In [210]:
# Standard lib
import re
import pickle
from collections import OrderedDict
from datetime import datetime
# Third-party
from sqlalchemy import create_engine
import numpy as np
import matplotlib.pyplot as pl
%matplotlib inline
from sklearn.feature_extraction import text
from sklearn.utils.extmath import cartesian
import nltk
from nltk.stem.porter import PorterStemmer
import yaml
import pandas as pd
In [130]:
config_filename = "/Users/adrian/projects/aas-abstract-sorter/sql_login.yml"
with open(config_filename) as f:
    config = yaml.safe_load(f)  # safe_load: don't execute arbitrary YAML tags
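For reference, sql_login.yml just holds the pieces of the connection string used below. The field names are the ones the format string expects; the values here are placeholders, not real credentials:
user: some_username
password: some_password
server: localhost
database: some_database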
In [135]:
engine = create_engine('mysql+pymysql://{user}:{password}@{server}/{database}'.format(**config))
engine.connect()
_presentation_cache = dict()
In [195]:
query = """
SELECT session.so_id, presentation.title,
presentation.abstract, presentation.id
FROM session, presentation
WHERE session.meeting_code = 'aas227'
AND session.so_id = presentation.session_so_id
AND presentation.status IN ('Sessioned', '')
AND session.type IN (
'Oral Session'
, 'Special Session'
, 'Splinter Meeting'
)
ORDER BY presentation.id;
"""
result = engine.execute(query)
all_results = result.fetchall()
presentation_df = pd.DataFrame(all_results, columns=all_results[0].keys())
# strip HTML tags from the abstract text
presentation_df['abstract'] = presentation_df['abstract'].str.replace('<[^<]+?>', '', regex=True)
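As a quick sanity check of that tag-stripping regex (the input string here is invented):
In [ ]:
re.sub('<[^<]+?>', '', 'We find <i>no</i> evidence for a <b>dark</b> companion.')
# -> 'We find no evidence for a dark companion.'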
In [178]:
query = """
SELECT session.title, session.start_date_time, session.end_date_time, session.so_id
FROM session
WHERE session.meeting_code = 'aas227'
AND session.type IN (
'Oral Session'
, 'Special Session'
, 'Splinter Meeting'
)
ORDER BY session.so_id;
"""
result = engine.execute(query)
session_results = result.fetchall()
session_df = pd.DataFrame(session_results, columns=session_results[0].keys())
session_df['start_date_time'] = pd.to_datetime(session_df['start_date_time'])
session_df['end_date_time'] = pd.to_datetime(session_df['end_date_time'])
session_df = session_df[1:] # zero-th entry has a corrupt date
In [152]:
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non-letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

vectorizer = text.CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
)
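As a quick check that the tokenizer does what we expect (the sentence is invented; nltk.word_tokenize also needs the 'punkt' tokenizer data, available via nltk.download('punkt')):
In [ ]:
tokenize("galaxies evolving in clustered environments")
# roughly: ['galaxi', 'evolv', 'in', 'cluster', 'environ']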
In [155]:
count_matrix = vectorizer.fit_transform(presentation_df['abstract']).toarray()
count_matrix.shape
Out[155]:
In [165]:
ten_most_common_idx = count_matrix.sum(axis=0).argsort()[::-1][:10]
feature_words = np.array(vectorizer.get_feature_names())
print(feature_words[ten_most_common_idx])
In [244]:
similarity_matrix = np.zeros((count_matrix.shape[0], count_matrix.shape[0]))
for ix1 in range(count_matrix.shape[0]):
    for ix2 in range(count_matrix.shape[0]):
        num = count_matrix[ix1].dot(count_matrix[ix2])
        denom = np.linalg.norm(count_matrix[ix1]) * np.linalg.norm(count_matrix[ix2])
        if num < 1:  # if no common words, the vectors are orthogonal
            v = 0.
        else:
            v = num / denom
        similarity_matrix[ix1, ix2] = v
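The double loop works, but it's slow for a few thousand abstracts. For what it's worth, a vectorized equivalent is just scikit-learn's built-in cosine similarity, which also returns 0 for all-zero count vectors:
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(count_matrix)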
In [245]:
similarity_matrix_1d = np.triu(similarity_matrix).ravel()
top_ten = sorted(np.unique(similarity_matrix_1d[~np.isclose(similarity_matrix_1d, 1.)]),
                 reverse=True)[:10]
In [246]:
# Find the (row, column) indices of the abstract pairs with those top-ten scores
ix = np.where(np.isin(np.triu(similarity_matrix, k=1), top_ten))
for ix1, ix2 in zip(ix[0], ix[1]):
    print(presentation_df['title'].iloc[ix1])
    print(presentation_df['title'].iloc[ix2])
    print()
Those seem pretty similar! Looks like the code is working...
For now, we'll start with the first day of conference talks, 5 Jan. We'll also only check for sessions that have the same start time (of course, we should really be looking at any overlapping sessions, but this is fine as a first pass...).
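(For the record, the overlap test itself would be easy; a minimal sketch, assuming the session_df columns above:
In [ ]:
def sessions_overlap(row1, row2):
    # two time intervals overlap iff each one starts before the other ends
    return (row1['start_date_time'] < row2['end_date_time']
            and row2['start_date_time'] < row1['end_date_time'])
)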
In [238]:
def session_similarity(so_id1, so_id2):
    """
    Compute the similarity between two sessions by getting the sub-matrix of the
    similarity matrix for all pairs of presentations, one from each session.
    """
    presentations_session1 = presentation_df[presentation_df['so_id'] == so_id1]
    presentations_session2 = presentation_df[presentation_df['so_id'] == so_id2]

    if len(presentations_session1) == 0 or len(presentations_session2) == 0:
        # no presentations in one of the sessions
        return np.array([])

    index_pairs = cartesian((presentations_session1.index, presentations_session2.index)).T
    sub_matrix = similarity_matrix[(index_pairs[0], index_pairs[1])]
    shape = (len(presentations_session1), len(presentations_session2))
    sub_matrix = sub_matrix.reshape(shape)
    return sub_matrix
In [262]:
for name, group in session_df[session_df['start_date_time'] >= datetime(2016, 1, 5)].groupby('start_date_time'):
    for title1, so_id1 in zip(group['title'], group['so_id']):
        for title2, so_id2 in zip(group['title'], group['so_id']):
            if so_id1 >= so_id2:
                continue

            scores = session_similarity(so_id1, so_id2)
            if len(scores) == 0:  # no presentations in one of the sessions
                continue

            if scores.max() > 0.5:  # totally arbitrary threshold
                print(title1)
                print(title2)
                print(scores.max(), np.median(scores))
                print()
These are pairs of sessions scheduled in the same time slot where a talk in one has significant abstract overlap with a talk in the other.